{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import sys,random\n",
    "\n",
    "import joblib\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import xgboost as xgb\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import roc_curve\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the folders that later cells write feature scores, models and predictions into.\n",
    "# makedirs(exist_ok=True) avoids the check-then-create race of exists() followed by mkdir().\n",
    "for out_dir in ('featurescore', 'model', 'preds'):\n",
    "    os.makedirs(out_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Number of rows kept from every training feature table; it equals the count in the\n",
    "# label file name (train_y_33465.lz4).\n",
    "N_TRAIN = 33465\n",
    "input_dir = '../../preprocess_data_new/'\n",
    "data_date = joblib.load(input_dir + 'train_ax_date.lz4')[:N_TRAIN]\n",
    "data_nodup = joblib.load(input_dir + 'train_ax_nodup.lz4').drop(columns=['id','loan_dt'])[:N_TRAIN]\n",
    "data_null = joblib.load(input_dir + 'train_ax_null.lz4')[:N_TRAIN]\n",
    "# Move the tag column to the end to match the valid set, where tag is appended last.\n",
    "data_tag = data_nodup[['tag']]\n",
    "data_nodup = data_nodup.drop(columns=['tag'])\n",
    "data = pd.concat([data_date,data_nodup,data_null,data_tag],axis=1)\n",
    "\n",
    "# Missing values are encoded as -1 before handing the matrix to XGBoost.\n",
    "x = data.fillna(-1).values\n",
    "data_label = joblib.load(input_dir + 'train_y_33465.lz4')\n",
    "y = data_label['label'].values\n",
    "feature_names = list(data.columns)\n",
    "\n",
    "# Fail fast if the column-wise concat or the label file does not line up row for row.\n",
    "assert len(data) == len(y), 'features and labels have different row counts'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(33465, 4807)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: (rows, columns) of the assembled training frame.\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 注意:data_nodup发生了一些变化\n",
    "\n",
    "代码整理之前：使用旧data_nodup（近似data_raw）,拼接后数据维度为：6702     \n",
    "代码整理之后：使用新data_nodup,拼接后数据维度为：4807"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 本地验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Split the data into a training part and a 30% hold-out part.\n",
    "x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)\n",
    "\n",
    "# 2. Booster parameters.\n",
    "# Class balance comes from the label vector y rather than from summing every column of data_label.\n",
    "n_pos = float(np.sum(y))\n",
    "n_neg = float(len(y)) - n_pos\n",
    "# 'early_stopping_rounds' is an argument of xgb.train(), not a booster parameter, so it is\n",
    "# not listed here; placed inside params it never triggered early stopping.\n",
    "params={\n",
    "    'booster':'gbtree',\n",
    "    'objective': 'binary:logistic',\n",
    "    'scale_pos_weight': n_neg / n_pos,  # negatives divided by positives\n",
    "    'eval_metric': 'auc',\n",
    "    'gamma':1,\n",
    "    'max_depth':6,\n",
    "    'lambda':1,\n",
    "    'subsample':0.9,\n",
    "    'colsample_bytree':0.9,\n",
    "    'min_child_weight':1, \n",
    "    'eta': 0.04,\n",
    "    'seed':2010,\n",
    "    'nthread':32\n",
    "        }\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "开始训练！\n",
      "[0]\ttrain-auc:0.824189\n",
      "[50]\ttrain-auc:0.958696\n",
      "[100]\ttrain-auc:0.986746\n",
      "[150]\ttrain-auc:0.995115\n",
      "[200]\ttrain-auc:0.998245\n",
      "[250]\ttrain-auc:0.999418\n",
      "[299]\ttrain-auc:0.999776\n"
     ]
    }
   ],
   "source": [
    "# 3. Train the model.\n",
    "dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=feature_names)\n",
    "# Log AUC on the hold-out split as well: train-auc alone cannot reveal over-fitting.\n",
    "# The extra eval set is only monitored (no early stopping), so the fitted model is unchanged.\n",
    "dholdout = xgb.DMatrix(x_test, label=y_test, feature_names=feature_names)\n",
    "watchlist  = [(dtrain,'train'), (dholdout,'holdout')]\n",
    "print('开始训练！')\n",
    "model  = xgb.train(params,dtrain, num_boost_round=300, evals =watchlist, verbose_eval=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "开始预测！\n",
      "AUC: 0.8289777507133729\n"
     ]
    }
   ],
   "source": [
    "# 4. Predict on the hold-out split and report its ROC AUC.\n",
    "print('开始预测！')\n",
    "dtest = xgb.DMatrix(data=x_test, feature_names=feature_names)\n",
    "y_pre = model.predict(dtest)\n",
    "auc = metrics.roc_auc_score(y_true=y_test, y_score=y_pre)\n",
    "print('AUC:',auc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 1.(整理代码之前，结果不能复现)使用data_raw（这里的no_dup较之data_raw，减少了100多维）作为主干特征\n",
    "#### 用于反映不同特征组合的auc变化，不和第2点作比较，因为随机数种子不一样\n",
    "raw AUC: 0.8130921791239684  \n",
    "raw 去除nan列 AUC: 0.8125236195981279  \n",
    "raw 去除nan列 + 统计null AUC: 0.8263001992035767   \n",
    "nodup + null AUC: 0.8261527677046496  \n",
    "nodup + null + tag AUC:0.8296696198580618 \n",
    " \n",
    "nodup + null + tag (前4000维)AUC: 0.8251877022906782    \n",
    "nodup + null + tag (前5990维) AUC: 0.821991126741565  \n",
    "nodup + null + tag (前3000维) AUC: 0.8275583356303323  \n",
    "nodup + null + tag (前2000维) AUC: 0.8249837418081847\n",
    "\n",
    "nodup + null + tag (PCA降至8维) AUC: 0.5830606257872517  \n",
    "nodup + null + tag (fillna(-1))AUC: 0.8313812914152184  \n",
    "\n",
    "#### 2.使用data_nodup作为主干特征\n",
    "nodup + null + tag (fillna(-1))： AUC: 0.8289777507133729"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 训练与预测"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 1.读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# NOTE(review): the two '.csv' paths below are read with joblib.load, not pd.read_csv --\n",
    "# confirm these files really are joblib dumps.\n",
    "valid_date = joblib.load(input_dir + 'valid_date.csv')\n",
    "valid_nodup = joblib.load(input_dir + 'valid_nodup.lz4').drop(columns=['loan_dt'])\n",
    "valid_null = joblib.load(input_dir + 'valid_row_null.csv').drop(columns=['id'])\n",
    "valid_tag = pd.read_csv('../../2.explore_data/6.explore_tag/predict_tag/valid_tag.csv',usecols=['tag'])\n",
    "valid_id = valid_nodup['id'].values\n",
    "valid_nodup = valid_nodup.drop(columns=['id'])\n",
    "\n",
    "valid = pd.concat([valid_date,valid_nodup,valid_null,valid_tag],axis=1)\n",
    "# concat(axis=1) aligns on the index; differing indexes would add rows instead of failing.\n",
    "assert len(valid) == len(valid_id), 'valid feature tables are not row-aligned'\n",
    "\n",
    "# Check the feature layout now instead of at predict time, after a model has been trained.\n",
    "train_columns = list(data.columns)\n",
    "column_diff = set(valid.columns) ^ set(train_columns)\n",
    "if column_diff:\n",
    "    raise ValueError('valid/train feature mismatch: {0}'.format(sorted(map(str, column_diff))))\n",
    "valid = valid[train_columns]\n",
    "\n",
    "# Pass the label column as a 1-D array, the same way y is built for local validation.\n",
    "dtrain = xgb.DMatrix(data.fillna(-1).values, data_label['label'].values, feature_names=train_columns)\n",
    "dtest = xgb.DMatrix(valid.fillna(-1).values, feature_names=train_columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2.训练并保存模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pipeline(dtrain,dtest,test_id,iteration,random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight):\n",
    "    \"\"\"Train one XGBoost model, then save the model, its test predictions and feature scores.\n",
    "\n",
    "    Args:\n",
    "        dtrain: xgb.DMatrix with labels, used for training.\n",
    "        dtest: xgb.DMatrix scored with the trained model.\n",
    "        test_id: ids written to the 'id' column of the prediction file.\n",
    "        iteration: run index used in the three output file names.\n",
    "        random_seed, gamma, max_depth, lambd, subsample, colsample_bytree, min_child_weight:\n",
    "            values for the booster parameters 'seed', 'gamma', 'max_depth', 'lambda',\n",
    "            'subsample', 'colsample_bytree' and 'min_child_weight'.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if max_depth has no configured number of boosting rounds, or if\n",
    "            dtrain holds no positive labels.\n",
    "\n",
    "    Writes ./model/xgb{iteration}.model, ./preds/xgb{iteration}.csv and\n",
    "    ./featurescore/feature_score_{iteration}.csv; the folders must already exist.\n",
    "    \"\"\"\n",
    "    # Deeper trees are given fewer boosting rounds.\n",
    "    rounds_by_depth = {6: 500, 7: 400, 8: 300}\n",
    "    if max_depth not in rounds_by_depth:\n",
    "        raise ValueError('unsupported max_depth {0!r}; expected one of {1}'.format(max_depth, sorted(rounds_by_depth)))\n",
    "    num_boost_round = rounds_by_depth[max_depth]\n",
    "\n",
    "    # Class balance is read from the training matrix itself, not from a notebook global.\n",
    "    labels = dtrain.get_label()\n",
    "    n_pos = float(np.sum(labels, dtype=np.float64))\n",
    "    if n_pos == 0:\n",
    "        raise ValueError('dtrain has no positive labels; cannot compute scale_pos_weight')\n",
    "    n_neg = float(len(labels)) - n_pos\n",
    "\n",
    "    # 'early_stopping_rounds' is an xgb.train() argument, not a booster parameter,\n",
    "    # so it is not part of this dict.\n",
    "    params={\n",
    "    'booster':'gbtree',\n",
    "    'objective': 'binary:logistic',\n",
    "    'scale_pos_weight': n_neg / n_pos,  # negatives divided by positives\n",
    "    'eval_metric': 'auc',\n",
    "    'gamma':gamma,\n",
    "    'max_depth':max_depth,\n",
    "    'lambda':lambd,\n",
    "    'subsample':subsample,\n",
    "    'colsample_bytree':colsample_bytree,\n",
    "    'min_child_weight':min_child_weight, \n",
    "    'eta': 0.04,\n",
    "    'seed':random_seed,\n",
    "    'nthread':16\n",
    "        }\n",
    "    watchlist  = [(dtrain,'train')]\n",
    "    model = xgb.train(params,dtrain,num_boost_round=num_boost_round,evals=watchlist)\n",
    "    model.save_model('./model/xgb{0}.model'.format(iteration))\n",
    "\n",
    "    # Predict the test set and store one score per id.\n",
    "    test_y = model.predict(dtest)\n",
    "    test_result = pd.DataFrame(test_id,columns=[\"id\"])\n",
    "    test_result['score'] = test_y\n",
    "    test_result.to_csv(\"./preds/xgb{0}.csv\".format(iteration),index=None,encoding='utf-8')\n",
    "\n",
    "    # Store the feature scores from get_fscore(), highest first.\n",
    "    feature_score = sorted(model.get_fscore().items(), key=lambda item: item[1], reverse=True)\n",
    "    with open('./featurescore/feature_score_{0}.csv'.format(iteration),'w') as f:\n",
    "        f.write(\"feature,score\\n\")\n",
    "        f.writelines(\"{0},{1}\\n\".format(name, score) for name, score in feature_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Number of models to train; each one takes the i-th entry of every shuffled list below.\n",
    "N_MODELS = 36\n",
    "\n",
    "random_seed = list(range(1000,2000,20))\n",
    "gamma = [i/1000.0 for i in range(100,200,2)]\n",
    "max_depth = [6,7,8]\n",
    "lambd = list(range(200,400,2))\n",
    "subsample = [i/1000.0 for i in range(600,700,2)]\n",
    "colsample_bytree = [i/1000.0 for i in range(250,350,2)]\n",
    "min_child_weight = [i/1000.0 for i in range(200,300,2)]\n",
    "random.shuffle(random_seed)\n",
    "random.shuffle(gamma)\n",
    "random.shuffle(max_depth)\n",
    "random.shuffle(lambd)\n",
    "random.shuffle(subsample)\n",
    "random.shuffle(colsample_bytree)\n",
    "random.shuffle(min_child_weight)\n",
    "\n",
    "# max_depth is cycled with a modulo, every other list is indexed directly by the model number.\n",
    "assert min(len(values) for values in (random_seed, gamma, lambd, subsample, colsample_bytree, min_child_weight)) >= N_MODELS\n",
    "\n",
    "# The shuffles are unseeded, so the drawn order is saved to make this run repeatable.\n",
    "with open('params.pkl','wb') as f:\n",
    "    pickle.dump((random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight),f)\n",
    "\n",
    "# To reproduce my result, uncomment the following lines\n",
    "# (only unpickle files you created yourself; pickle.load can execute arbitrary code):\n",
    "# with open('params_for_reproducing.pkl','rb') as f:\n",
    "#     random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight = pickle.load(f)\n",
    "\n",
    "for i in range(N_MODELS):\n",
    "    pipeline(dtrain,dtest,valid_id,i,random_seed[i],gamma[i],max_depth[i%len(max_depth)],lambd[i],subsample[i],colsample_bytree[i],min_child_weight[i])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 线上valid-auc分数\n",
    "raw 去除nan列 + 统计null AUC: 0.82751113123698  \n",
    "nodup + null + tag AUC: 0.82803008109167  \n",
    "nodup + null + tag（rank融合）AUC:0.8279914450872  \n",
    "nodup + null + tag (fillna(-1)) AUC:0.82979480823375"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
